The project aims to analyze customer purchase behavior (specifically, purchase amount) against customer gender and various other factors, to help the business make better decisions. The business wants to understand whether spending habits differ between male and female customers: do women spend more on Black Friday than men?
Through the use of confidence intervals (CI) and bootstrap methods, we identified key trends in spending patterns.
Recommendations based on these findings are summarized at the end of the analysis.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes = True)
import scipy.stats as stats
from scipy.stats import norm
import warnings
warnings.filterwarnings("ignore")
#load data
# Read the Walmart Black Friday transactions CSV into a DataFrame.
df = pd.read_csv('walmart_data.csv')
# Preview the first five rows to sanity-check the load.
df.head()
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | 7969 |
df.tail()
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|
| 550063 | 1006033 | P00372445 | M | 51-55 | 13 | B | 1 | 1 | 20 | 368 |
| 550064 | 1006035 | P00375436 | F | 26-35 | 1 | C | 3 | 0 | 20 | 371 |
| 550065 | 1006036 | P00375436 | F | 26-35 | 15 | B | 4+ | 1 | 20 | 137 |
| 550066 | 1006038 | P00375436 | F | 55+ | 1 | C | 2 | 0 | 20 | 365 |
| 550067 | 1006039 | P00371644 | F | 46-50 | 0 | B | 4+ | 1 | 20 | 490 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 550068 entries, 0 to 550067 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 User_ID 550068 non-null int64 1 Product_ID 550068 non-null object 2 Gender 550068 non-null object 3 Age 550068 non-null object 4 Occupation 550068 non-null int64 5 City_Category 550068 non-null object 6 Stay_In_Current_City_Years 550068 non-null object 7 Marital_Status 550068 non-null int64 8 Product_Category 550068 non-null int64 9 Purchase 550068 non-null int64 dtypes: int64(5), object(5) memory usage: 42.0+ MB
df.shape
(550068, 10)
df.columns
Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category',
'Purchase'],
dtype='object')
df.dtypes
User_ID int64 Product_ID object Gender object Age object Occupation int64 City_Category object Stay_In_Current_City_Years object Marital_Status int64 Product_Category int64 Purchase int64 dtype: object
We will find out the unique values in each column. This will help us understand how varied the data is.
def print_unique_values(df):
    """Print the distinct values found in every column of *df*."""
    for name, series in df.items():
        print(f"\nUnique Values of {name}:", series.unique())
print_unique_values(df)
Unique Values of User_ID: [1000001 1000002 1000003 ... 1004113 1005391 1001529] Unique Values of Product_ID: ['P00069042' 'P00248942' 'P00087842' ... 'P00370293' 'P00371644' 'P00370853'] Unique Values of Gender: ['F' 'M'] Unique Values of Age: ['0-17' '55+' '26-35' '46-50' '51-55' '36-45' '18-25'] Unique Values of Occupation: [10 16 15 7 20 9 1 12 17 0 3 4 11 8 19 2 18 5 14 13 6] Unique Values of City_Category: ['A' 'C' 'B'] Unique Values of Stay_In_Current_City_Years: ['2' '4+' '3' '1' '0'] Unique Values of Marital_Status: [0 1] Unique Values of Product_Category: [ 3 1 12 8 5 4 2 6 14 11 13 15 7 16 18 10 17 9 20 19] Unique Values of Purchase: [ 8370 15200 1422 ... 135 123 613]
def print_nunique_values(df):
    """Print how many distinct values each column of *df* contains."""
    # df.nunique() computes the counts for all columns in one pass.
    counts = df.nunique()
    for name, count in counts.items():
        print(f"\nNumber of Unique Values of {name}:", count)
print_nunique_values(df)
Number of Unique Values of User_ID: 5891 Number of Unique Values of Product_ID: 3631 Number of Unique Values of Gender: 2 Number of Unique Values of Age: 7 Number of Unique Values of Occupation: 21 Number of Unique Values of City_Category: 3 Number of Unique Values of Stay_In_Current_City_Years: 5 Number of Unique Values of Marital_Status: 2 Number of Unique Values of Product_Category: 20 Number of Unique Values of Purchase: 18105
# change column value in Stay_In_Current_City_Years since it has "+" symbols involved.
# The column stores "4+" for four-or-more years; strip the "+" so the column
# can later be converted to a numeric dtype.
# regex=False makes the replacement literal: with the old pandas default of
# regex=True, '+' alone is an invalid regex ("nothing to repeat"), and newer
# pandas emitted a FutureWarning about the changing default.
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].str.replace('+', '', regex=False)
# check again
df['Stay_In_Current_City_Years'].unique()
array(['2', '4', '3', '1', '0'], dtype=object)
# convert to numeric value
# Cast the cleaned Stay_In_Current_City_Years strings ('0'..'4') to integers.
df['Stay_In_Current_City_Years'] = pd.to_numeric(df['Stay_In_Current_City_Years'])
# Describe ALL columns: numeric columns get mean/std/quantiles, while object
# columns (Product_ID, Gender, Age, City_Category) get unique/top/freq.
df.describe(include='all')
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 5.500680e+05 | 550068 | 550068 | 550068 | 550068.000000 | 550068 | 550068.000000 | 550068.000000 | 550068.000000 | 550068.000000 |
| unique | NaN | 3631 | 2 | 7 | NaN | 3 | NaN | NaN | NaN | NaN |
| top | NaN | P00265242 | M | 26-35 | NaN | B | NaN | NaN | NaN | NaN |
| freq | NaN | 1880 | 414259 | 219587 | NaN | 231173 | NaN | NaN | NaN | NaN |
| mean | 1.003029e+06 | NaN | NaN | NaN | 8.076707 | NaN | 1.858418 | 0.409653 | 5.404270 | 9263.968713 |
| std | 1.727592e+03 | NaN | NaN | NaN | 6.522660 | NaN | 1.289443 | 0.491770 | 3.936211 | 5023.065394 |
| min | 1.000001e+06 | NaN | NaN | NaN | 0.000000 | NaN | 0.000000 | 0.000000 | 1.000000 | 12.000000 |
| 25% | 1.001516e+06 | NaN | NaN | NaN | 2.000000 | NaN | 1.000000 | 0.000000 | 1.000000 | 5823.000000 |
| 50% | 1.003077e+06 | NaN | NaN | NaN | 7.000000 | NaN | 2.000000 | 0.000000 | 5.000000 | 8047.000000 |
| 75% | 1.004478e+06 | NaN | NaN | NaN | 14.000000 | NaN | 3.000000 | 1.000000 | 8.000000 | 12054.000000 |
| max | 1.006040e+06 | NaN | NaN | NaN | 20.000000 | NaN | 4.000000 | 1.000000 | 20.000000 | 23961.000000 |
# check for skewness
# Skewness of every numeric column (positive values = right-tailed).
numeric_part = df.select_dtypes(include=['int64', 'float64'])
numeric_part.skew()
User_ID 0.003066 Occupation 0.400140 Stay_In_Current_City_Years 0.317236 Marital_Status 0.367437 Product_Category 1.025735 Purchase 0.600140 dtype: float64
Key Observations: the dataset has 550,068 rows and 10 columns with no missing values and no duplicate rows (verified by the checks below); `Purchase` is moderately right-skewed (skew ≈ 0.60) and `Product_Category` strongly so (skew ≈ 1.03).
df.isnull().sum()
User_ID 0 Product_ID 0 Gender 0 Age 0 Occupation 0 City_Category 0 Stay_In_Current_City_Years 0 Marital_Status 0 Product_Category 0 Purchase 0 dtype: int64
# check for dups
# Count fully duplicated rows.
df.duplicated().sum()
0
[col for col in df.columns if df[col].dtype in ['int64', 'float64']]
['User_ID', 'Occupation', 'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category', 'Purchase']
# NOTE(review): this cell was preceded by truncated prose — "...User_ID &
# Product_Category for our analysis since they're irrelevant" — presumably
# explaining which columns are excluded from the distribution plots below;
# confirm against the original notebook.
fig, axis = plt.subplots(nrows = 2, ncols = 2, figsize = (12, 10))
fig.subplots_adjust(top = 0.8)
# Plot distribution plots for specific columns.
# sns.distplot is deprecated (removed in seaborn 0.14+); histplot with
# kde=True and stat='density' reproduces the same density-scaled view.
sns.histplot(df['Occupation'], kde = True, stat = 'density', ax = axis[0, 0], color = '#CD5C5C')
sns.histplot(df['Stay_In_Current_City_Years'], kde = True, stat = 'density', ax = axis[0, 1], color = '#CD5C5C')
sns.histplot(df['Marital_Status'], kde = True, stat = 'density', ax = axis[1, 0], color = '#CD5C5C')
# Plot distribution plot for `Purchase` with normal curve fit
sns.histplot(df['Purchase'], kde = True, stat = 'density', ax = axis[1, 1], color = '#CD5C5C')
# Fitting target variable to the normal curve
mu, sigma = norm.fit(df['Purchase'])
print(f"Mean: {mu}, Standard Deviation: {sigma}")
# Adding legend to the 'Purchase' distribution plot with the fitted parameters
axis[1, 1].legend(['Normal Distribution (μ = {:.2f}, σ = {:.2f})'.format(mu, sigma)], loc = 'best')
plt.show()
Mean: 9263.968712959126, Standard Deviation: 5023.060827959928
# Count plots for each feature
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Features laid out on a 4x2 grid, one histogram each.
feature_cols = ('Gender', 'Age', 'Occupation', 'City_Category',
                'Stay_In_Current_City_Years', 'Marital_Status',
                'Product_Category', 'Purchase')
fig = make_subplots(rows = 4, cols = 2, subplot_titles = feature_cols)

# Only the first four traces carried explicit names in the original cell;
# the remainder use plotly's default trace names.
explicitly_named = {'Gender', 'Age', 'Occupation', 'City_Category'}
for position, column in enumerate(feature_cols):
    grid_row, grid_col = divmod(position, 2)
    if column in explicitly_named:
        trace = go.Histogram(x = df[column], name = column)
    else:
        trace = go.Histogram(x = df[column])
    fig.add_trace(trace, row = grid_row + 1, col = grid_col + 1)

# adjust layout
fig.update_layout(height = 1000, width = 1200, showlegend = False, title_text = 'Count Plots for Each Feature')
fig.show()